Linear Regression in OLS

Author
Affiliation

Dave Clark

Binghamton University

Published

January 20, 2025

The Linear OLS Model

Introduction

This document demonstrates the process of linear regression, starting with data simulation, estimation using R’s lm() function, and then replicating the process using matrix algebra. We’ll work with both simulated and real-world data (Boston housing dataset).

Simulated Data Example

We’ll simulate data based on the following model:

y = b_0 + b_1*x1 + b_2*x2 + error

where:

  • y is a continuous, positive outcome variable.
  • x1 is a binary variable.
  • x2 is a continuous variable.
set.seed(8675309)  # For reproducibility
n <- 100 # Number of observations

# Simulate data
x1 <- rbinom(n, 1, 0.5)
x2 <- rnorm(n, 10, 2)
error <- rnorm(n, 0, 3)  # Error term
b0 <- 5
b1 <- 2
b2 <- -0.5
y <- b0 + b1 * x1 + b2 * x2 + error

sim_data <- data.frame(y, x1, x2)

Estimation with lm()

model_lm <- lm(y ~ x1 + x2, data = sim_data)
summary(model_lm)

Call:
lm(formula = y ~ x1 + x2, data = sim_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.0134 -2.2345  0.3141  2.3522  8.4468 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   3.7208     1.8970   1.961   0.0527 .  
x1            2.7952     0.6710   4.166 6.73e-05 ***
x2           -0.4278     0.1837  -2.329   0.0219 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.353 on 97 degrees of freedom
Multiple R-squared:  0.1874,    Adjusted R-squared:  0.1707 
F-statistic: 11.19 on 2 and 97 DF,  p-value: 4.254e-05
# Create a table of coefficients
lm_coef_table <- knitr::kable(coef(summary(model_lm)), digits = 3, caption = "Coefficients from `lm()`")
lm_coef_table
Coefficients from lm()
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.721 1.897 1.961 0.053
x1 2.795 0.671 4.166 0.000
x2 -0.428 0.184 -2.329 0.022
code
#plot predictions in highcharter 

# Prediction Plot

pred_plot <- highchart() %>%
  hc_add_series(plot_data, "scatter", hcaes(x = x2, y = y), name = "Actual Data (x1=0)") %>%
  hc_add_series(data.frame(x2 = x2_pred, pred = predictions), "line", hcaes(x=x2, y = pred), name="Predicted") %>%
  hc_title(text = "Regression Predictions (lm)") %>%
  hc_xAxis(title = list(text = "x2")) %>%
  hc_yAxis(title = list(text = "y"))
pred_plot
code
# Residual Plot
resid_plot <- highchart() %>%
  hc_add_series(plot_data, "column", hcaes(x = x2, y = pred - y), name = "Residuals", color="grey") %>%
  hc_xAxis(title = list(text = "x2")) %>%
  hc_yAxis(title = list(text = "Residuals")) %>%
  hc_title(text = "Plot of Residuals (lm)")

resid_plot

Estimation with Matrix Algebra

# Create design matrix X and outcome vector y
X <- cbind(1, sim_data$x1, sim_data$x2)
y <- sim_data$y

# Calculate coefficients using matrix algebra
b <- solve(t(X) %*% X) %*% t(X) %*% y

# Calculate standard errors
residuals <- y - X %*% b
sigma_sq <- sum(residuals^2) / (n - ncol(X))
var_b <- sigma_sq * solve(t(X) %*% X)
se_b <- sqrt(diag(var_b))

# Create a table of coefficients
matrix_coef_table <- knitr::kable(data.frame(Estimate = b, `Std. Error` = se_b, row.names = c("Intercept", "x1", "x2")), digits = 3, caption = "Coefficients from Matrix Algebra")

matrix_coef_table
Coefficients from Matrix Algebra
Estimate Std..Error
Intercept 3.721 1.897
x1 2.795 0.671
x2 -0.428 0.184
# Predictions and plots (matrix)
X_pred <- cbind(1, x1_pred, x2_pred)
predictions_matrix <- X_pred %*% b

# Prediction Plot
pred_plot_matrix <- highchart() %>%
 hc_add_series(plot_data, "scatter", hcaes(x = x2, y = y), name = "Actual Data (x1=0)") %>%
  hc_add_series(data.frame(x2 = x2_pred, pred = predictions_matrix), "line", hcaes(x=x2, y = pred), name="Predicted") %>%
    hc_title(text = "Regression Predictions (Matrix)") %>%
  hc_xAxis(title = list(text = "x2")) %>%
  hc_yAxis(title = list(text = "y"))

pred_plot_matrix
resid_plot_matrix <- highchart() %>%
    hc_add_series(plot_data, "column", hcaes(x = x2, y = pred - y), name = "Residuals", color="grey") %>%
  hc_xAxis(title = list(text = "x2")) %>%
  hc_yAxis(title = list(text = "Residuals")) %>%
  hc_title(text = "Plot of Residuals (Matrix)")

resid_plot_matrix

Boston Housing Data Example

library(MASS)
data(Boston)

We estimate this model: medv ~ crim + age + tax + black + dis + ptratio

boston_model <- lm(medv ~ crim + age + tax + black + dis + ptratio, data = Boston)
summary(boston_model)

Call:
lm(formula = medv ~ crim + age + tax + black + dis + ptratio, 
    data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-16.671  -4.151  -1.315   2.270  33.143 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 58.773732   3.721963  15.791  < 2e-16 ***
crim        -0.140667   0.046639  -3.016 0.002691 ** 
age         -0.094768   0.017458  -5.428 8.89e-08 ***
tax         -0.007048   0.002845  -2.477 0.013576 *  
black        0.014560   0.003972   3.666 0.000273 ***
dis         -0.922736   0.238417  -3.870 0.000123 ***
ptratio     -1.519762   0.166843  -9.109  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.179 on 499 degrees of freedom
Multiple R-squared:  0.398, Adjusted R-squared:  0.3907 
F-statistic: 54.98 on 6 and 499 DF,  p-value: < 2.2e-16
boston_lm_coef_table <- knitr::kable(coef(summary(boston_model)), digits = 3, caption = "Boston Model Coefficients (lm)")
boston_lm_coef_table
Boston Model Coefficients (lm)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 58.774 3.722 15.791 0.000
crim -0.141 0.047 -3.016 0.003
age -0.095 0.017 -5.428 0.000
tax -0.007 0.003 -2.477 0.014
black 0.015 0.004 3.666 0.000
dis -0.923 0.238 -3.870 0.000
ptratio -1.520 0.167 -9.109 0.000
# Predictions for Boston data (lm)
central_tendencies <- apply(Boston[, c("crim", "age", "tax", "black", "dis")], 2, median)
ptratio_pred <- seq(min(Boston$ptratio), max(Boston$ptratio), length.out = 100)

boston_new_data <- data.frame(crim = central_tendencies["crim"], 
                          age = central_tendencies["age"],
                          tax = central_tendencies["tax"],
                          black = central_tendencies["black"],
                          dis = central_tendencies["dis"],
                          ptratio = ptratio_pred)

boston_predictions <- predict(boston_model, newdata = boston_new_data)

# Plotting data (using all observations for scatter)
plot_data_boston_lm <- Boston
plot_data_boston_lm$pred <- predict(boston_model)

# Prediction plot
boston_lm_pred_plot <- highchart() %>%
  hc_add_series(plot_data_boston_lm, "scatter", hcaes(x = ptratio, y = medv), name = "Actual Data") %>%
  hc_add_series(data.frame(ptratio = ptratio_pred, pred = boston_predictions), "line", hcaes(x=ptratio, y = pred), name="Predicted") %>%
  hc_title(text = "Boston Housing Predictions (lm)") %>%
  hc_xAxis(title = list(text = "ptratio")) %>%
  hc_yAxis(title = list(text = "medv"))

boston_lm_pred_plot
# Residuals plot
boston_lm_resid_plot <- highchart() %>%
 hc_add_series(plot_data_boston_lm, "column", hcaes(x = ptratio, y = pred - medv), name = "Residuals", color="grey") %>%
  hc_xAxis(title = list(text = "ptratio")) %>%
  hc_yAxis(title = list(text = "Residuals")) %>%
  hc_title(text = "Plot of Residuals (lm)")

boston_lm_resid_plot

Matrix Algebra with Boston Data

X_boston <- model.matrix(boston_model) 
y_boston <- Boston$medv

b_boston <- solve(t(X_boston) %*% X_boston) %*% t(X_boston) %*% y_boston

# ... (Calculate standard errors, t-values, p-values similarly to the simulated data example.  Include this code if needed).


boston_matrix_coef_table <- knitr::kable(data.frame(Estimate=b_boston), digits = 3, caption = "Boston Model Coefficients (Matrix)")

boston_matrix_coef_table
Boston Model Coefficients (Matrix)
Estimate
(Intercept) 58.774
crim -0.141
age -0.095
tax -0.007
black 0.015
dis -0.923
ptratio -1.520
# Predictions for Boston data (Matrix)
X_boston_pred <- cbind(1, central_tendencies["crim"], central_tendencies["age"], central_tendencies["tax"],
                         central_tendencies["black"], central_tendencies["dis"], ptratio_pred)

boston_predictions_matrix <- X_boston_pred %*% b_boston

# Prediction Plot
boston_matrix_pred_plot <- highchart() %>%
  hc_add_series(plot_data_boston_lm, "scatter", hcaes(x = ptratio, y = medv), name = "Actual Data") %>%
  hc_add_series(data.frame(ptratio = ptratio_pred, pred = boston_predictions_matrix), "line", hcaes(x = ptratio, y = pred), name = "Predicted") %>%
  hc_title(text = "Boston Housing Predictions (Matrix)") %>%
  hc_xAxis(title = list(text = "ptratio")) %>%
  hc_yAxis(title = list(text = "medv"))

boston_matrix_pred_plot
# Residual plot (from lm predictions - for fair comparison)
boston_matrix_resid_plot <- highchart() %>%
  hc_add_series(plot_data_boston_lm, "column", hcaes(x = ptratio, y = pred - medv), name = "Residuals", color="grey") %>%
  hc_xAxis(title = list(text = "ptratio")) %>%
  hc_yAxis(title = list(text = "Residuals")) %>%
  hc_title(text = "Plot of Residuals (Matrix)")

boston_matrix_resid_plot

Conclusion

Back to top